{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install -U pip\n",
    "! pip install -U clearml==0.16.2rc0\n",
    "! pip install -U pandas==1.0.4\n",
    "! pip install -U numpy==1.18.4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "\n",
    "from clearml import Task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "task = Task.init(project_name='Tabular Example', task_name='tabular preprocessing')\n",
    "logger = task.get_logger()\n",
    "configuration_dict = {'data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n",
    "                      'fill_categorical_NA': True, 'fill_numerical_NA': True}\n",
    "configuration_dict = task.connect(configuration_dict)  # enabling configuration override by clearml\n",
    "print(configuration_dict)  # printing actual configuration (after override in remote mode)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_task = Task.get_task(configuration_dict.get('data_task_id'))\n",
    "train_set = data_task.artifacts['train_data'].get().drop(columns=['Unnamed: 0'])\n",
    "val_set = data_task.artifacts['val_data'].get().drop(columns=['Unnamed: 0'])\n",
    "logger.report_table(title='Trainset - raw',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **Pre-processing**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove hour and year from DateTime data\n",
    "def change_time_format(data_frame):\n",
    "    timestamp = pd.to_datetime(data_frame['DateTime'])\n",
    "    months = [d.month for d in timestamp]\n",
    "    data_frame['Month'] = pd.DataFrame(months).astype('object')\n",
    "    data_frame.drop(columns= ['DateTime'], inplace=True)\n",
    "    return data_frame\n",
    "\n",
    "train_set = change_time_format(train_set)\n",
    "val_set = change_time_format(val_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_age_format(data_frame):    \n",
    "    age = data_frame['AgeuponOutcome']\n",
    "    months_age = []\n",
    "    for val in age:\n",
    "        if pd.isnull(val):\n",
    "            months_age.append(val)\n",
    "        else:\n",
    "            amount, time_type = val.split(' ')\n",
    "            if 'day' in time_type:\n",
    "                mult = 1./30\n",
    "            if 'week' in time_type:\n",
    "                mult = 1./4\n",
    "            if 'month' in time_type:\n",
    "                mult = 1.\n",
    "            if 'year' in time_type:\n",
    "                mult = 12.\n",
    "            months_age.append(int(amount) * mult)\n",
    "    data_frame['Age'] = pd.DataFrame(months_age).astype(np.float32)\n",
    "    data_frame.drop(columns= ['AgeuponOutcome'], inplace=True)\n",
    "    return data_frame\n",
    "    \n",
    "train_set = change_age_format(train_set)\n",
    "val_set = change_age_format(val_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_sex_format(data_frame): \n",
    "    sex_neutered = data_frame['SexuponOutcome']\n",
    "    sex = []\n",
    "    neutered = []\n",
    "    for val in sex_neutered:\n",
    "        if pd.isnull(val):\n",
    "            sex.append(val)\n",
    "            neutered.append(val)\n",
    "        elif 'Unknown' in val:\n",
    "            sex.append(np.nan)\n",
    "            neutered.append(np.nan)\n",
    "        else:\n",
    "            n, s = val.split(' ')\n",
    "            if n in ['Neutered', 'Spayed']:\n",
    "                neutered.append('Yes')\n",
    "            else:\n",
    "                neutered.append('No')\n",
    "            sex.append(s)\n",
    "\n",
    "    data_frame['Sex'] = pd.DataFrame(sex)\n",
    "    data_frame['Neutered'] = pd.DataFrame(neutered)\n",
    "    data_frame.drop(columns= ['SexuponOutcome'], inplace=True)\n",
    "    return data_frame\n",
    "\n",
    "train_set = change_sex_format(train_set)\n",
    "val_set = change_sex_format(val_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove irrelevant columns\n",
    "def remove_columns(data_frame, list_columns_names=None):\n",
    "    if list_columns_names is not None:\n",
    "        data_frame.drop(columns= list_columns_names, inplace=True)\n",
    "    return data_frame\n",
    "\n",
    "train_set = remove_columns(train_set, ['Name', 'OutcomeSubtype', 'AnimalID'])\n",
    "val_set = remove_columns(val_set, ['Name', 'OutcomeSubtype', 'AnimalID'])\n",
    "\n",
    "logger.report_table(title='Trainset - after preprocessing',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## *Fill NA Values*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "object_columns = train_set.select_dtypes(include=['object']).copy()\n",
    "numerical_columns = train_set.select_dtypes(include=['number']).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if configuration_dict.get('fill_categorical_NA', True):\n",
    "    for col in object_columns.columns:\n",
    "        if object_columns[col].isnull().sum() > 0:\n",
    "            most_common = Counter(object_columns[col]).most_common(1)[0][0]\n",
    "            print('Column \"{}\": replacing null values with \"{}\"'.format(col, most_common))\n",
    "            train_set[col].fillna(most_common, inplace=True)\n",
    "            val_set[col].fillna(most_common, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if configuration_dict.get('fill_numerical_NA', True):\n",
    "    for col in numerical_columns.columns:\n",
    "        if numerical_columns[col].isnull().sum() > 0:\n",
    "            median_val = numerical_columns[col].median()\n",
    "            print('Column \"{}\": replacing null values with \"{}\"'.format(col, median_val))\n",
    "            train_set[col].fillna(median_val, inplace=True)\n",
    "            val_set[col].fillna(median_val, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop rows with NA values if were chosen not to be filled\n",
    "train_set.dropna(inplace=True)\n",
    "val_set.dropna(inplace=True)\n",
    "if configuration_dict.get('fill_categorical_NA', True) or configuration_dict.get('fill_numerical_NA', True):\n",
    "    logger.report_table(title='Trainset - after filling missing values',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## *Labels Encoding*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = pd.concat([train_set, val_set])\n",
    "outcome_categories = all_data['OutcomeType'].astype('category').cat.categories\n",
    "outcome_dict = {key: val for val,key in enumerate(outcome_categories)}\n",
    "task.upload_artifact('Outcome dictionary', outcome_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in object_columns.columns:\n",
    "    all_data[col] = all_data[col].astype('category').cat.codes\n",
    "train_set = all_data.iloc[:len(train_set.index), :]\n",
    "val_set = all_data.iloc[len(train_set.index):, :]\n",
    "logger.report_table(title='Trainset - after labels encoding',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# making all variables categorical\n",
    "object_columns_names = object_columns.drop(columns= ['OutcomeType']).columns\n",
    "for col in object_columns_names:\n",
    "    all_data[col] = all_data[col].astype('category')\n",
    "columns_categries = {col: len(all_data[col].cat.categories) for col in object_columns_names}\n",
    "task.upload_artifact('Categries per column', columns_categries)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "task.upload_artifact('train_data', artifact_object=train_set)\n",
    "task.upload_artifact('val_data', artifact_object=val_set)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
